<!DOCTYPE html>



  


<html class="theme-next gemini use-motion" lang="zh-Hans">
<head><meta name="generator" content="Hexo 3.8.0">
  <meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1">
<meta name="theme-color" content="#222">









<meta http-equiv="Cache-Control" content="no-transform">
<meta http-equiv="Cache-Control" content="no-siteapp">
















  
  
  <link href="/hcigmoid/lib/fancybox/source/jquery.fancybox.css?v=2.1.5" rel="stylesheet" type="text/css">




  
  
  
  

  
    
    
  

  
    
      
    

    
  

  

  

  

  
    
    
    <link href="//fonts.googleapis.com/css?family=Lato:300,300italic,400,400italic,700,700italic|Lato:300,300italic,400,400italic,700,700italic&subset=latin,latin-ext" rel="stylesheet" type="text/css">
  






<link href="/hcigmoid/lib/font-awesome/css/font-awesome.min.css?v=4.6.2" rel="stylesheet" type="text/css">

<link href="/hcigmoid/css/main.css?v=5.1.4" rel="stylesheet" type="text/css">


  <link rel="apple-touch-icon" sizes="180x180" href="/hcigmoid/images/apple-touch-icon-next.png?v=5.1.4">


  <link rel="icon" type="image/png" sizes="32x32" href="/hcigmoid/images/favicon-32x32-next.png?v=5.1.4">


  <link rel="icon" type="image/png" sizes="16x16" href="/hcigmoid/images/favicon-16x16-next.png?v=5.1.4">


  <link rel="mask-icon" href="/hcigmoid/images/logo.svg?v=5.1.4" color="#222">





  <meta name="keywords" content="weighted sample,有放回采样,alias method,无放回采样,A-ExpJ,">





  <link rel="alternate" href="/hcigmoid/atom.xml" title="HCigmoid" type="application/atom+xml">






<meta name="description" content="加权随机采样在推荐系统中随处可见，既可能用在模型训练数据处理过程中，也可能用于一些规则式的推荐策略里。典型的场景例如：  在新用户冷启动时，我们可以通过某些指标评估出内容的质量，并根据质量得分来将内容加权随机推荐给新用户，质量越高的内容，被曝光给新用户的概率也越大。 在样本采样时，有一种方法是对每条正样本，随机从所有的内容中选取 $k$ 个负样本，而每个内容被选为负样本的概率与其热度成正比（例如">
<meta name="keywords" content="weighted sample,有放回采样,alias method,无放回采样,A-ExpJ">
<meta property="og:type" content="article">
<meta property="og:title" content="加权随机采样">
<meta property="og:url" content="http://guyuecanhui.gitee.io/hcigmoid/2020/12/05/weighted-sample/index.html">
<meta property="og:site_name" content="HCigmoid">
<meta property="og:description" content="加权随机采样在推荐系统中随处可见，既可能用在模型训练数据处理过程中，也可能用于一些规则式的推荐策略里。典型的场景例如：  在新用户冷启动时，我们可以通过某些指标评估出内容的质量，并根据质量得分来将内容加权随机推荐给新用户，质量越高的内容，被曝光给新用户的概率也越大。 在样本采样时，有一种方法是对每条正样本，随机从所有的内容中选取 $k$ 个负样本，而每个内容被选为负样本的概率与其热度成正比（例如">
<meta property="og:locale" content="default">
<meta property="og:updated_time" content="2020-12-08T15:57:16.158Z">
<meta name="twitter:card" content="summary">
<meta name="twitter:title" content="加权随机采样">
<meta name="twitter:description" content="加权随机采样在推荐系统中随处可见，既可能用在模型训练数据处理过程中，也可能用于一些规则式的推荐策略里。典型的场景例如：  在新用户冷启动时，我们可以通过某些指标评估出内容的质量，并根据质量得分来将内容加权随机推荐给新用户，质量越高的内容，被曝光给新用户的概率也越大。 在样本采样时，有一种方法是对每条正样本，随机从所有的内容中选取 $k$ 个负样本，而每个内容被选为负样本的概率与其热度成正比（例如">



<script type="text/javascript" id="hexo.configurations">
  // Theme namespace object: reuse window.NexT when it already exists,
  // otherwise start from an empty object.
  var NexT = window.NexT || {};
  // Page-level settings literal emitted into the page.
  // NOTE(review): presumably consumed by the theme scripts loaded elsewhere on
  // the page; the consumers are not visible in this file, so key names and
  // value shapes should be treated as a fixed contract.
  var CONFIG = {
    // Same path prefix that the asset and navigation URLs in this page use.
    root: '/hcigmoid/',
    scheme: 'Gemini',
    version: '5.1.4',
    // Sidebar settings, written as a JSON-style object literal.
    sidebar: {"position":"left","display":"post","offset":12,"b2t":false,"scrollpercent":false,"onmobile":false},
    fancybox: true,
    tabs: true,
    // Animation settings: an enable flag plus one named transition per page region.
    motion: {"enable":true,"async":false,"transition":{"post_block":"fadeIn","post_header":"slideDownIn","post_body":"slideDownIn","coll_header":"slideLeftIn","sidebar":"slideUpIn"}},
    // Duoshuo settings; userId is the string '0' here.
    duoshuo: {
      userId: '0',
      author: '博主'
    },
    // Algolia search settings; applicationID, apiKey and indexName are all
    // empty strings in this page.
    algolia: {
      applicationID: '',
      apiKey: '',
      indexName: '',
      hits: {"per_page":10},
      // The ${query}, ${hits} and ${time} markers sit inside ordinary
      // double-quoted strings, so they are literal text at this point and are
      // not interpolated by JavaScript here.
      labels: {"input_placeholder":"Search for Posts","hits_empty":"We didn't find any results for the search: ${query}","hits_stats":"${hits} results found in ${time} ms"}
    }
  };
</script>



  <link rel="canonical" href="http://guyuecanhui.gitee.io/hcigmoid/2020/12/05/weighted-sample/">





  <title>加权随机采样 | HCigmoid</title>
  








</head>

<body itemscope itemtype="http://schema.org/WebPage" lang="zh-Hans">

  
  
    
  

  <div class="container sidebar-position-left page-post-detail">
    <div class="headband"></div>

    <header id="header" class="header" itemscope itemtype="http://schema.org/WPHeader">
      <div class="header-inner"><div class="site-brand-wrapper">
  <div class="site-meta ">
    

    <div class="custom-logo-site-title">
      <a href="/hcigmoid/" class="brand" rel="start">
        <span class="logo-line-before"><i></i></span>
        <span class="site-title">HCigmoid</span>
        <span class="logo-line-after"><i></i></span>
      </a>
    </div>
      
        <p class="site-subtitle">Watch, learn and practise</p>
      
  </div>

  <div class="site-nav-toggle">
    <button>
      <span class="btn-bar"></span>
      <span class="btn-bar"></span>
      <span class="btn-bar"></span>
    </button>
  </div>
</div>

<nav class="site-nav">
  

  
    <ul id="menu" class="menu">
      
        
        <li class="menu-item menu-item-home">
          <a href="/hcigmoid/" rel="section">
            
              <i class="menu-item-icon fa fa-fw fa-home"></i> <br>
            
            首页
          </a>
        </li>
      
        
        <li class="menu-item menu-item-about">
          <a href="/hcigmoid/about/" rel="section">
            
              <i class="menu-item-icon fa fa-fw fa-user"></i> <br>
            
            关于
          </a>
        </li>
      
        
        <li class="menu-item menu-item-tags">
          <a href="/hcigmoid/tags/" rel="section">
            
              <i class="menu-item-icon fa fa-fw fa-tags"></i> <br>
            
            标签
          </a>
        </li>
      
        
        <li class="menu-item menu-item-categories">
          <a href="/hcigmoid/categories/" rel="section">
            
              <i class="menu-item-icon fa fa-fw fa-th"></i> <br>
            
            分类
          </a>
        </li>
      
        
        <li class="menu-item menu-item-archives">
          <a href="/hcigmoid/archives/" rel="section">
            
              <i class="menu-item-icon fa fa-fw fa-archive"></i> <br>
            
            归档
          </a>
        </li>
      

      
        <li class="menu-item menu-item-search">
          
            <a href="javascript:;" class="popup-trigger">
          
            
              <i class="menu-item-icon fa fa-search fa-fw"></i> <br>
            
            搜索
          </a>
        </li>
      
    </ul>
  

  
    <div class="site-search">
      
  <div class="popup search-popup local-search-popup">
  <div class="local-search-header clearfix">
    <span class="search-icon">
      <i class="fa fa-search"></i>
    </span>
    <span class="popup-btn-close">
      <i class="fa fa-times-circle"></i>
    </span>
    <div class="local-search-input-wrapper">
      <input autocomplete="off" placeholder="搜索..." spellcheck="false" type="text" id="local-search-input">
    </div>
  </div>
  <div id="local-search-result"></div>
</div>



    </div>
  
</nav>



 </div>
    </header>

    <main id="main" class="main">
      <div class="main-inner">
        <div class="content-wrap">
          <div id="content" class="content">
            

  <div id="posts" class="posts-expand">
    

  

  
  
  

  <article class="post post-type-normal" itemscope itemtype="http://schema.org/Article">
  
  
  
  <div class="post-block">
    <link itemprop="mainEntityOfPage" href="http://guyuecanhui.gitee.io/hcigmoid/2020/12/05/weighted-sample/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="name" content="古月残辉">
      <meta itemprop="description" content>
      <meta itemprop="image" content="/hcigmoid/images/avatar.gif">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="HCigmoid">
    </span>

    
      <header class="post-header">

        
        
          <h1 class="post-title" itemprop="name headline">加权随机采样</h1>
        

        <div class="post-meta">
          <span class="post-time">
            
              <span class="post-meta-item-icon">
                <i class="fa fa-calendar-o"></i>
              </span>
              
                <span class="post-meta-item-text">发表于</span>
              
              <time title="创建于" itemprop="dateCreated datePublished" datetime="2020-12-05T22:54:16+08:00">
                2020-12-05
              </time>
            

            

            
          </span>

          
            <span class="post-category">
            
              <span class="post-meta-divider">|</span>
            
              <span class="post-meta-item-icon">
                <i class="fa fa-folder-o"></i>
              </span>
              
                <span class="post-meta-item-text">分类于</span>
              
              
                <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
                  <a href="/hcigmoid/categories/数学/" itemprop="url" rel="index">
                    <span itemprop="name">数学</span>
                  </a>
                </span>

                
                
              
            </span>
          

          
            
              <span class="post-comments-count">
                <span class="post-meta-divider">|</span>
                <span class="post-meta-item-icon">
                  <i class="fa fa-comment-o"></i>
                </span>
                <a href="/hcigmoid/2020/12/05/weighted-sample/#comments" itemprop="discussionUrl">
                  <span class="post-comments-count valine-comment-count" data-xid="/hcigmoid/2020/12/05/weighted-sample/" itemprop="commentCount"></span>
                </a>
              </span>
            
          

          
          

          
            <span class="post-meta-divider">|</span>
            <span class="page-pv"><i class="fa fa-file-o"></i> 阅读数
            <span class="busuanzi-value" id="busuanzi_value_page_pv"></span>
            </span>
          

          
            <div class="post-wordcount">
              
                
                <span class="post-meta-item-icon">
                  <i class="fa fa-file-word-o"></i>
                </span>
                
                  <span class="post-meta-item-text">字数统计&#58;</span>
                
                <span title="字数统计">
                  2.7k
                </span>
              

              
                <span class="post-meta-divider">|</span>
              

              
                <span class="post-meta-item-icon">
                  <i class="fa fa-clock-o"></i>
                </span>
                
                  <span class="post-meta-item-text">阅读时长 &asymp;</span>
                
                <span title="阅读时长">
                  10
                </span>
              
            </div>
          

          

        </div>
      </header>
    

    
    
    
    <div class="post-body" itemprop="articleBody">

      
      

      
        <p>加权随机采样在推荐系统中随处可见，既可能用在模型训练数据处理过程中，也可能用于一些规则式的推荐策略里。典型的场景例如：</p>
<ol>
<li>在新用户冷启动时，我们可以通过某些指标评估出内容的质量，并根据质量得分来将内容加权随机推荐给新用户，质量越高的内容，被曝光给新用户的概率也越大。</li>
<li>在样本采样时，有一种方法是对每条正样本，随机从所有的内容中选取 $k$ 个负样本，而每个内容被选为负样本的概率与其热度成正比（例如 <strong>word2vec</strong> 的 <strong>negative sampling</strong> 技术）。</li>
<li>基于用户历史行为可以构建内容的有向图，当用户行为较稀疏时，我们可以使用 <strong>deepwalk</strong> 之类的算法在图中随机游走，生成内容的序列，再基于 <strong>word2vec</strong> 等算法生成这些内容的 embedding。在随机游走时，比如当前到达节点 $v$，那么下一次游走到其他节点的概率，与有向边的权重正相关。</li>
</ol>
<p>以上这些加权采样的场景往往都不可避免的面对大数据量挑战，因此对性能要求较高。而再细分一下，上面的场景 1, 2 都属于无放回采样，也就是需要采样的内容都不相同；而场景 3 则属于有放回采样，即允许采到相同的样本。本文就这两类加权随机采样问题分别探讨高效的解法。</p>
<a id="more"></a>
<h3 id="常规方案——基数法"><a href="#常规方案——基数法" class="headerlink" title="常规方案——基数法"></a>常规方案——基数法</h3><p>先再把问题抽象并更正式一点的描述一下：假设列表 $S$ 中有 $n$ 个元素，初始时，已知每个元素 $i$ 被抽取的概率为 <script type="math/tex">p_i: \sum_{i=1}^n p_i=1</script>，则：</p>
<ol>
<li>无放回加权随机采样是指，随机从 $S$ 中不放回的抽取 $m$ 个元素，每个元素 $i$ 被抽取的概率为 <script type="math/tex">q_i= \frac{p_i}{\sum_{j\in S'}p_j}</script>，其中 $S'$ 表示该次抽样时所有剩余元素的集合；</li>
<li>有放回加权随机采样是指，随机从 $S$ 中有放回的抽取 $m$ 个元素，每个元素 $i$ 被抽取的概率为 $p_i$。</li>
</ol>
<p>一般来讲，我们更希望获得一种时间复杂度低的抽样算法，也就是能够用尽可能少的步骤来得到 $m$ 个抽样的结果。在不考虑极致的精度和极端场景时，一种简单粗暴的方案是基于基数的抽样算法，这也是 <strong>word2vec</strong> 官方实现$^{[1]}$的策略：</p>
<ol>
<li>生成一个超大的数组 $A$，数组长度为 $k$，例如 $k=10^8$，或者 $k=\text{ceil}(1/\min(p_i))$；</li>
<li>将数组中的元素填充为集合元素的 index，元素 $i$ 的 index 被填充的数量为 <script type="math/tex">\lfloor k\cdot \sum_{j\le i} p_j\rfloor - \lfloor k\cdot \sum_{j< i} p_j\rfloor</script>；</li>
<li>每次生成随机整数 $r: 0\le r &lt; k$，并将 $A[r]$ 对应的元素作为采样结果；</li>
<li>对于无放回的情况，还需要记录已经采样的结果集合，如果采样的结果出现在集合中，则重新采样，直到得到 $m$ 个不同的采样结果。</li>
</ol>
<p>我们将这种方法称为基数法。这种方法在 $S$ 中元素的数量远远超过 $m$，并且每个元素被抽取的概率远远小于 $1$ 时有较高的效率，是一种空间换时间的策略。但是在极端情况下，例如某个元素 $i$ 的 $p_i$ 接近 $1$ 时，无放回的抽样效率可能极低，因为每次抽样绝大概率抽到 $i$，进而需要重新抽样。另外，由于第 $2$ 步实际是一种近似策略，只能保证大体上数组 $A$ 中的元素 index 数量与其抽样概率成正相关，而不能保留精确的比例。因此，在需要更高效率和更高精度的条件下，我们可以考虑下面的方案。</p>
<h3 id="有放回加权随机采样——Alias-Method"><a href="#有放回加权随机采样——Alias-Method" class="headerlink" title="有放回加权随机采样——Alias Method"></a>有放回加权随机采样——Alias Method</h3><p>虽然基数法应用在有放回加权随机采样时，每次采样的时间复杂度为 $O(1)$，但是我们获得的实际上是一种非精确的采样，并且基数法的空间复杂度较高。相对而言，<strong>Alias Method</strong>$^{[2]}$ 是一种空间和时间都极为高效的算法，它的主要流程分为初始化和采样两步：首先用 $O(n)$ 的复杂度初始化两个长度为 $n$ 的数组，然后基于这两个数组进行采样，每次采样的复杂度为 $O(1)$。下面详细介绍下这种方法的流程。</p>
<p>首先将所有元素的抽样概率都乘以 $n$，得到一个平均概率为 $1$ 的新的采样概率 $Q=[q_1, q_2, \cdots,q_n]$，然后对这些概率进行两两组合：</p>
<ol>
<li>选择一个概率不超过 $1$ 的元素 $i$：$q_i\le 1$，设置 $Prob[i]=q_i$；</li>
<li>选择一个概率不小于 $1$ 的元素 $j$：$q_j\ge 1$，设置 $Alias[i] = j$，并将 $q_j:=q_j-(1-q_i)$，即用 $q_j$ 来补足 $q_i$ 少于 $1$ 的部分；</li>
</ol>
<p>这样组合以后，我们得到了两个数组：原始概率 $Prob$ 和组合元素索引 $Alias$。从构造的过程中，我们可以发现这两个数组的长度都是 $n$，并且 $Prob$ 对应的类别顺序与 $P$ 一致，而 $Alias$ 则保存着跟原始类别进行概率组合的类别编号。进一步的，对于每个数组下标 $i$，它一定会对应一个原始的元素 $i$，以及至多一个组合元素 $Alias[i]$，这个组合元素是用来跟 $i$ 一起将概率凑成 $1$ 的。</p>
<p>基于这两个数组，在每次采样时，只需要生成两个随机数：</p>
<ol>
<li>第一个随机数范围是 $r_1\in[1,n]$，用于确定原始元素，假设 $r_1=i$；</li>
<li>第二个随机数范围是 $r_2\in [0,1]$，如果 $r_2&lt; Prob[i]$，本次采样的类别就是原始元素 $i$，否则本次采样的类别是其组合元素 $j=Alias[i]$；</li>
</ol>
<p>到这里，大家肯定会好奇，在两两组合概率时，是不是一定能找到一种方案，使得对于所有的元素 $i$，一定能为其找到最多一个组合类别，使得它们的概率之和为 $1$。答案当然是肯定的，我们可以用归纳法进行证明：</p>
<ol>
<li>当 $n=1$ 时，$Prob[1]=1$, $Alias[1]=null$，命题显然成立；</li>
<li>对于任意正整数 $k$，假设当 $n=k$ 时命题成立，则当 $n=k+1$ 时，我们一定能找到两个类别 $c_i, c_j$，满足 $q_i\le 1, q_j\ge 1$，则我们设置 $Prob[i]=q_i$，$Alias[i] = j$，$q_j:=q_j-(1-q_i)$ 后，得到除 $c_i$ 外的 $k$ 个类别，它们的平均概率仍为 $1$，因此根据假设，$n=k+1$ 时，命题仍然成立。</li>
</ol>
<p>因此，我们按上面的组合策略，一定能成功构造出一个 $Alias$ 和 $Prob$ 的数组。</p>
<p>下面提供了一个 <strong>python</strong> 版本的实现，仅供参考。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">import</span> numpy <span class="keyword">as</span> np</span><br><span class="line"><span class="keyword">import</span> numpy.random <span class="keyword">as</span> npr</span><br><span class="line"></span><br><span class="line"><span class="class"><span class="keyword">class</span> <span class="title">AliasMethodSampling</span>:</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">__init__</span><span class="params">(self, 
p)</span>:</span></span><br><span class="line">        self.prob, self.alias = self._setup_alias(p)</span><br><span class="line">    </span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">_setup_alias</span><span class="params">(self, p)</span>:</span></span><br><span class="line">        small, large = [], []</span><br><span class="line">        n = len(p)</span><br><span class="line">        prob, alias = np.zeros(n), np.zeros(n, dtype=int)</span><br><span class="line">        <span class="comment"># init small and large array</span></span><br><span class="line">        <span class="keyword">for</span> i, pi <span class="keyword">in</span> enumerate(p):</span><br><span class="line">            prob[i] = pi * n</span><br><span class="line">            small.append(i) <span class="keyword">if</span> prob[i] &lt; <span class="number">1.0</span> <span class="keyword">else</span> large.append(i)</span><br><span class="line">        <span class="comment"># fill a small element each iteration</span></span><br><span class="line">        <span class="keyword">while</span> small <span class="keyword">and</span> large:</span><br><span class="line">            il, ig = small.pop(), large.pop()</span><br><span class="line">            alias[il] = ig</span><br><span class="line">            prob[ig] = (prob[ig] + prob[il]) - <span class="number">1</span></span><br><span class="line">            small.append(ig) <span class="keyword">if</span> prob[ig] &lt; <span class="number">1</span> <span class="keyword">else</span> large.append(ig)</span><br><span class="line">        <span class="comment"># handle numerical instability</span></span><br><span class="line">        <span class="keyword">while</span> large:</span><br><span class="line">            print(<span class="string">'only large exists'</span>, prob, alias, large, small)</span><br><span class="line">            prob[large.pop()] = <span 
class="number">1</span></span><br><span class="line">        <span class="keyword">while</span> small:</span><br><span class="line">            print(<span class="string">'only small exists'</span>, prob, alias, large, small)</span><br><span class="line">            prob[small.pop()] = <span class="number">1</span></span><br><span class="line">        <span class="keyword">return</span> prob, alias</span><br><span class="line">        </span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">take_sample</span><span class="params">(self)</span>:</span></span><br><span class="line">        i_rand = npr.randint(len(self.prob))</span><br><span class="line">        <span class="keyword">return</span> i_rand <span class="keyword">if</span> npr.rand() &lt; self.prob[i_rand] <span class="keyword">else</span> self.alias[i_rand]</span><br><span class="line">    </span><br><span class="line">    </span><br><span class="line">probs, cnt = [<span class="number">0.1</span>, <span class="number">0.5</span>, <span class="number">0.2</span>, <span class="number">0.05</span>, <span class="number">0.15</span>], np.zeros(<span class="number">5</span>)</span><br><span class="line">alias = AliasMethodSampling(probs)</span><br><span class="line"></span><br><span class="line"><span class="keyword">for</span> i <span class="keyword">in</span> range(<span class="number">1000000</span>):</span><br><span class="line">    cnt[alias.take_sample()] += <span class="number">1</span></span><br><span class="line">    </span><br><span class="line">cnt / <span class="number">1000000</span></span><br></pre></td></tr></table></figure>
<h3 id="无放回加权随机采样——A-ExpJ-算法"><a href="#无放回加权随机采样——A-ExpJ-算法" class="headerlink" title="无放回加权随机采样——A-ExpJ 算法"></a>无放回加权随机采样——A-ExpJ 算法</h3><p>在上面讨论的有放回加权随机采样情形中，每次抽样时，每个元素的抽样概率实际上是不发生变化的，我们也不需要知道之前抽样的结果是什么。但是在无放回情形中，每次抽到一个元素后，后面就不能再抽到相同的元素，也就意味着每次抽样后，元素的抽样概率会发生变化；并且我们需要记录之前抽样的结果有哪些，来防止抽到重复的元素。</p>
<p>前面也提到了，应用基数法来进行无放回加权随机采样的主要问题是可能重复抽到同一个元素，而生成随机数的成本其实是很高的，极端场景下，我们需要生成随机数的次数远大于 $n$。</p>
<p>文献 $[3]$ 提出了一种基于代理特征来采样的方法，也就是对于每个元素 $i$，我们选取服从均匀分布的随机数 $u_i=\text{rand}(0,1)$，用 $k_i=u_i^{1/p_i}$ 来作为采样的关键值，并选择关键值最大的 $m$ 个样本作为采样结果。这样，我们至多需要生成 $n$ 个随机数，就能完成采样过程。至于 $k_i$ 选择的正确性可以参考文献 $[3]$ 中的证明。基于这种代理特征的好处是，不需要知道每个元素的采样概率，只需要知道其权重即可 (也就是说不需要知道所有元素的总权重，无需做概率的归一化)，这就特别适合流式采样的场景。</p>
<p>当然，这里选择关键值最大的 $m$ 个样本可以采用容量为 $m$ 的最小堆来实现，这样就只需要进行一次全量 $O(n)$ 扫描，同时只要保留 $m$ 个当前最大的结果即可，这就是 <strong>A-Res</strong> 算法。在绝大多数场景中，<strong>A-Res</strong> 算法已经足够高效了。</p>
<p>为了进一步减少随机数生成的数量，作者提出了 <strong>A-ExpJ</strong> 算法，能将随机数的生成量从 $O(n)$ 减少到 $O(m\log(\frac{n}{m}))$。实际上就是用计算复杂度比较低的反向计算来代替复杂度高的随机数生成，并直接跳过一些关键值明显较小的元素。具体步骤如下：</p>
<ol>
<li>将列表 $S$ 的前 $m$ 个元素放入结果集合 $R$；</li>
<li>对于结果集里的每个元素，计算关键值 $k_i=u_i^{(1/p_i)}$，其中 $u_i=\text{rand}(0,1)$；</li>
<li>将 $R$ 中最小的关键值记为阈值 $k_{min}$；</li>
<li>对剩下的元素重复以下步骤：<ol>
<li>令 $r=\text{rand}(0,1)$ 且 $x_p=\log(r)/\log(k_{min})$；</li>
<li>从当前元素 $c$ 开始跳过元素，直到遇到元素 $i$，满足 <script type="math/tex">p_c+p_{c+1}+\cdots +p_{i-1}<x_p\le p_c+p_{c+1}+\cdots+p_{i-1}+p_i</script>；</li>
<li>使用 $i$ 替换 $R$ 中关键值最小的元素；</li>
<li>令 <script type="math/tex">t=k^{p_i}_{min}</script>, $r_2=\text{rand}(t,1)$, $i$ 的关键值 $k_i=r_2^{(1/p_i)}$；</li>
<li>令新的阈值 $k_{min}$ 为此时 $R$ 中的最小关键值。</li>
</ol>
</li>
</ol>
<p>下面提供了一种 <strong>python</strong> 实现，仅供参考。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">import</span> random</span><br><span class="line"><span class="keyword">import</span> math</span><br><span class="line"><span class="keyword">import</span> heapq</span><br><span class="line"><span class="keyword">import</span> numpy <span class="keyword">as</span> np</span><br><span class="line"></span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">a_expj_sample</span><span class="params">(prob, m)</span>:</span></span><br><span class="line">    <span class="string">""" 根据 prob 数组无放回随机抽取 m 个元素 """</span></span><br><span 
class="line">    topn = []</span><br><span class="line">    <span class="keyword">for</span> i, pi <span class="keyword">in</span> enumerate(prob[:m]):</span><br><span class="line">        heapq.heappush(topn, (random.random() ** (<span class="number">1</span>/pi), i))</span><br><span class="line">        </span><br><span class="line">    thres, w_sum = topn[<span class="number">0</span>][<span class="number">0</span>], <span class="number">0</span></span><br><span class="line">    xw = math.log(random.random()) / math.log(thres)</span><br><span class="line">    i = m</span><br><span class="line">    <span class="keyword">for</span> pi <span class="keyword">in</span> prob[m:]:</span><br><span class="line">        <span class="keyword">if</span> w_sum + pi &gt;= xw:</span><br><span class="line">            tw = thres ** pi</span><br><span class="line">            r2 = random.uniform(tw, <span class="number">1</span>)</span><br><span class="line">            ki = r2 ** (<span class="number">1</span>/pi)</span><br><span class="line">            heapq.heappop(topn)</span><br><span class="line">            heapq.heappush(topn, (ki, i))</span><br><span class="line">            thres = topn[<span class="number">0</span>][<span class="number">0</span>]</span><br><span class="line">            xw = math.log(random.random()) / math.log(thres)</span><br><span class="line">            w_sum = <span class="number">0</span></span><br><span class="line">        <span class="keyword">else</span>:</span><br><span class="line">            w_sum += pi</span><br><span class="line">        i += <span class="number">1</span></span><br><span class="line">    <span class="keyword">return</span> [item[<span class="number">1</span>] <span class="keyword">for</span> item <span class="keyword">in</span> topn]</span><br><span class="line"></span><br><span class="line"></span><br><span class="line">probs = [<span class="number">0.1</span>, <span class="number">0.5</span>, <span 
class="number">0.2</span>, <span class="number">0.05</span>, <span class="number">0.15</span>]</span><br><span class="line"></span><br><span class="line"><span class="keyword">for</span> k <span class="keyword">in</span> range(<span class="number">1</span>, <span class="number">4</span>):</span><br><span class="line">    cnt = np.zeros(<span class="number">5</span>)</span><br><span class="line">    <span class="keyword">for</span> i <span class="keyword">in</span> range(<span class="number">1000000</span>):</span><br><span class="line">        <span class="keyword">for</span> j <span class="keyword">in</span> a_expj_sample(probs, k):</span><br><span class="line">            cnt[j] += <span class="number">1</span></span><br><span class="line">    print(k, <span class="string">':'</span>, cnt / <span class="number">1000000</span> / k)</span><br></pre></td></tr></table></figure>
<h3 id="小结"><a href="#小结" class="headerlink" title="小结"></a>小结</h3><p>本文主要介绍了三种加权采样的算法，其中，有放回加权随机采样推荐使用 <strong>Alias Method</strong>，无放回加权随机采样推荐使用 <strong>A-ExpJ</strong> 或者 <strong>A-Res</strong>，如果对精度要求没那么高，并且样本权重呈现非极端的分布，也可以使用简单的基数法。本篇讨论的算法主要应用场景在于<strong>我们已知每个元素的采样权重，进而可以设置其采样概率</strong>。但是有些情况下，我们是<strong>不知道这些元素的分布是什么样</strong>，而我们希望从这些未知分布中抽取样本，再利用这些样本对目标做一个估计。这时就需要考虑重要性采样、马尔可夫链蒙特卡罗方法、<strong>Gibbs</strong> 采样等采样算法，后面有机会再讨论吧。</p>
<h3 id="参考文献"><a href="#参考文献" class="headerlink" title="参考文献"></a>参考文献</h3><p>[1] Github: word2vec. dav: <a href="https://github.com/dav/word2vec/blob/master/src/word2vec.c" target="_blank" rel="noopener">https://github.com/dav/word2vec/blob/master/src/word2vec.c</a>.</p>
<p>[2] Darts, Dice, and Coins. Keith Schwarz. 2011: <a href="http://www.keithschwarz.com/darts-dice-coins" target="_blank" rel="noopener">http://www.keithschwarz.com/darts-dice-coins</a>.</p>
<p>[3] Efraimidis, Pavlos S., and P. G. Spirakis. “Weighted random sampling with a reservoir.” <em>Information Processing Letters</em> 97.5 (2006): 181-185.</p>

      
    </div>
    
    
    

    

    

    

    <footer class="post-footer">
      
        <div class="post-tags">
          
            <a href="/hcigmoid/tags/weighted-sample/" rel="tag"># weighted sample</a>
          
            <a href="/hcigmoid/tags/有放回采样/" rel="tag"># 有放回采样</a>
          
            <a href="/hcigmoid/tags/alias-method/" rel="tag"># alias method</a>
          
            <a href="/hcigmoid/tags/无放回采样/" rel="tag"># 无放回采样</a>
          
            <a href="/hcigmoid/tags/A-ExpJ/" rel="tag"># A-ExpJ</a>
          
        </div>
      

      
      
      

      
        <div class="post-nav">
          <div class="post-nav-next post-nav-item">
            
              <a href="/hcigmoid/2020/07/29/paper-2018-hulu-dpp_map/" rel="next" title="Fast Greedy MAP for DPP 论文精读">
                <i class="fa fa-chevron-left"></i> Fast Greedy MAP for DPP 论文精读
              </a>
            
          </div>

          <span class="post-nav-divider"></span>

          <div class="post-nav-prev post-nav-item">
            
          </div>
        </div>
      

      
      
    </footer>
  </div>
  
  
  
  </article>



    <div class="post-spread">
      
    </div>
  </div>


          </div>
          


          

  
    <div class="comments" id="comments">
    </div>
  



        </div>
        
          
  
  <div class="sidebar-toggle">
    <div class="sidebar-toggle-line-wrap">
      <span class="sidebar-toggle-line sidebar-toggle-line-first"></span>
      <span class="sidebar-toggle-line sidebar-toggle-line-middle"></span>
      <span class="sidebar-toggle-line sidebar-toggle-line-last"></span>
    </div>
  </div>

  <aside id="sidebar" class="sidebar">
    
    <div class="sidebar-inner">

      

      
        <ul class="sidebar-nav motion-element">
          <li class="sidebar-nav-toc sidebar-nav-active" data-target="post-toc-wrap">
            文章目录
          </li>
          <li class="sidebar-nav-overview" data-target="site-overview-wrap">
            站点概览
          </li>
        </ul>
      

      <section class="site-overview-wrap sidebar-panel">
        <div class="site-overview">
          <div class="site-author motion-element" itemprop="author" itemscope itemtype="http://schema.org/Person">
            
              <p class="site-author-name" itemprop="name">古月残辉</p>
              <p class="site-description motion-element" itemprop="description">总结心得</p>
          </div>

          <nav class="site-state motion-element">

            
              <div class="site-state-item site-state-posts">
              
                <a href="/hcigmoid/archives/">
              
                  <span class="site-state-item-count">32</span>
                  <span class="site-state-item-name">日志</span>
                </a>
              </div>
            

            
              
              
              <div class="site-state-item site-state-categories">
                <a href="/hcigmoid/categories/index.html">
                  <span class="site-state-item-count">6</span>
                  <span class="site-state-item-name">分类</span>
                </a>
              </div>
            

            
              
              
              <div class="site-state-item site-state-tags">
                <a href="/hcigmoid/tags/index.html">
                  <span class="site-state-item-count">78</span>
                  <span class="site-state-item-name">标签</span>
                </a>
              </div>
            

          </nav>

          
            <div class="feed-link motion-element">
              <a href="/hcigmoid/atom.xml" rel="alternate">
                <i class="fa fa-rss"></i>
                RSS
              </a>
            </div>
          

          
            <div class="links-of-author motion-element">
                
                  <span class="links-of-author-item">
                    <a href="mailto:guyuecanhui@icloud.com" target="_blank" title="E-Mail">
                      
                        <i class="fa fa-fw fa-envelope"></i>E-Mail</a>
                  </span>
                
            </div>
          

          
          

          
          

          

        </div>
      </section>

      
      <!--noindex-->
        <section class="post-toc-wrap motion-element sidebar-panel sidebar-panel-active">
          <div class="post-toc">

            
              
            

            
              <div class="post-toc-content"><ol class="nav"><li class="nav-item nav-level-3"><a class="nav-link" href="#常规方案——基数法"><span class="nav-number">1.</span> <span class="nav-text">常规方案——基数法</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#有放回加权随机采样——Alias-Method"><span class="nav-number">2.</span> <span class="nav-text">有放回加权随机采样——Alias Method</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#无放回加权随机采样——A-ExpJ-算法"><span class="nav-number">3.</span> <span class="nav-text">无放回加权随机采样——A-ExpJ 算法</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#小结"><span class="nav-number">4.</span> <span class="nav-text">小结</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#参考文献"><span class="nav-number">5.</span> <span class="nav-text">参考文献</span></a></li></ol></div>
            

          </div>
        </section>
      <!--/noindex-->
      

      

    </div>
  </aside>


        
      </div>
    </main>

    <footer id="footer" class="footer">
      <div class="footer-inner">
        <div class="copyright">&copy; 2018 &mdash; <span itemprop="copyrightYear">2020</span>
  <span class="with-love">
    <i class="fa fa-user"></i>
  </span>
  <span class="author" itemprop="copyrightHolder">古月残辉</span>

  
    <span class="post-meta-divider">|</span>
    <span class="post-meta-item-icon">
      <i class="fa fa-area-chart"></i>
    </span>
    
      <span class="post-meta-item-text">Site words total count&#58;</span>
    
    <span title="Site words total count">58.2k</span>
  
</div>









<script async src="//busuanzi.ibruce.info/busuanzi/2.3/busuanzi.pure.mini.js"></script>

        
<div class="busuanzi-count">
  <!-- The busuanzi counter script is already included once, directly above
       this block, from busuanzi.ibruce.info. The second copy that used to sit
       here pointed at the old dn-lbstatics.qbox.me host; it is dropped so the
       counter is not requested twice and the page no longer depends on that
       host. The spans below are the placeholders identified by their
       busuanzi_value_* ids. -->

  
    <span class="site-uv">
      <i class="fa fa-user"></i> 访问人数
      <span class="busuanzi-value" id="busuanzi_value_site_uv"></span>
      人
    </span>
  

  
    <span class="site-pv">
      <i class="fa fa-eye"></i> 总访问量
      <span class="busuanzi-value" id="busuanzi_value_site_pv"></span>
      次
    </span>
  
</div>








        
      </div>
    </footer>

    
      <div class="back-to-top">
        <i class="fa fa-arrow-up"></i>
        
      </div>
    

    

  </div>

  

<script type="text/javascript">
  // Normalise window.Promise: unless Object.prototype.toString reports it as a
  // function, overwrite it with null.
  (function (toTag) {
    var looksLikeFunction = toTag.call(window.Promise) === '[object Function]';
    if (!looksLikeFunction) {
      window.Promise = null;
    }
  })(Object.prototype.toString);
</script>









  












  
  
    <script type="text/javascript" src="/hcigmoid/lib/jquery/index.js?v=2.1.3"></script>
  

  
  
    <script type="text/javascript" src="/hcigmoid/lib/fastclick/lib/fastclick.min.js?v=1.0.6"></script>
  

  
  
    <script type="text/javascript" src="/hcigmoid/lib/jquery_lazyload/jquery.lazyload.js?v=1.9.7"></script>
  

  
  
    <script type="text/javascript" src="/hcigmoid/lib/velocity/velocity.min.js?v=1.2.1"></script>
  

  
  
    <script type="text/javascript" src="/hcigmoid/lib/velocity/velocity.ui.min.js?v=1.2.1"></script>
  

  
  
    <script type="text/javascript" src="/hcigmoid/lib/fancybox/source/jquery.fancybox.pack.js?v=2.1.5"></script>
  


  


  <script type="text/javascript" src="/hcigmoid/js/src/utils.js?v=5.1.4"></script>

  <script type="text/javascript" src="/hcigmoid/js/src/motion.js?v=5.1.4"></script>



  
  


  <script type="text/javascript" src="/hcigmoid/js/src/affix.js?v=5.1.4"></script>

  <script type="text/javascript" src="/hcigmoid/js/src/schemes/pisces.js?v=5.1.4"></script>



  
  <script type="text/javascript" src="/hcigmoid/js/src/scrollspy.js?v=5.1.4"></script>
<script type="text/javascript" src="/hcigmoid/js/src/post-details.js?v=5.1.4"></script>



  


  <script type="text/javascript" src="/hcigmoid/js/src/bootstrap.js?v=5.1.4"></script>



  


  




	





  





  










  <script src="//cdn1.lncld.net/static/js/3.0.4/av-min.js"></script>
  <script src="//unpkg.com/valine/dist/Valine.min.js"></script>
  
  <script type="text/javascript">
    // Guest fields the comment form may ask for.
    var GUEST = ['nick','mail','link'];
    // Configured field list; entries that are not in GUEST are filtered out.
    var guest = 'nick,mail,link';
    // Plain function instead of an arrow function, matching the ES5 syntax used
    // by every other inline script on this page.
    guest = guest.split(',').filter(function (item) {
      return GUEST.indexOf(item) > -1;
    });
    // Mount the Valine comment widget on #comments.
    new Valine({
        el: '#comments',
        verify: false,
        notify: false,
        appId: '6du4Ppc2TvUuhcccRHSDNH2v-gzGzoHsz',
        appKey: 'zOKNml4W1Bq3OTzEuLt5hUjI',
        placeholder: '感谢阅读！欢迎评论！',
        avatar: 'mm',
        guest_info: guest,
        // The previous expression ('10' || 10) always evaluated to the string
        // '10'. Parse the configured value so pageSize is a number, and fall
        // back to 10 when it is not numeric.
        pageSize: parseInt('10', 10) || 10
    });
  </script>



  

  <script type="text/javascript">
    // Popup state: set to true by searchFunc once the index has been loaded.
    var isfetched = false;
    // Search index file name; an empty value falls back to the XML index.
    var search_path = "search.xml";
    if (search_path.length === 0) {
      search_path = "search.xml";
    }
    // The index is read as XML unless its name ends in "json" (any letter case).
    var isXml = !/json$/i.test(search_path);
    // Full URL of the index: site base path plus the file name.
    var path = "/hcigmoid/" + search_path;
    // monitor main search box;

    // Close the search popup and put the page back the way it was: hide the
    // popup, empty the query box, drop rendered results and the backdrop, and
    // clear the inline overflow style set on <body>.
    var onPopupClose = function (e) {
      $('.popup').hide();
      $('#local-search-input').val('');
      ['.search-result-list', '#no-result', ".local-search-pop-overlay"].forEach(function (selector) {
        $(selector).remove();
      });
      $('body').css('overflow', '');
    };

    // Show the search popup: add a backdrop that closes the popup when clicked,
    // stop the page behind it from scrolling, toggle the popup and focus the
    // query box.
    function proceedsearch() {
      var $body = $("body");
      $body.append('<div class="search-popup-overlay local-search-pop-overlay"></div>');
      $body.css('overflow', 'hidden');

      $('.search-popup-overlay').click(onPopupClose);
      $('.popup').toggle();

      // Switch off auto-capitalisation and auto-correction on the query box
      // before handing it the focus.
      $('#local-search-input')
        .attr("autocapitalize", "none")
        .attr("autocorrect", "off")
        .focus();
    }

    // search function;
    // Download the search index and wire up the search box.
    //
    //   path       - URL of the index; requested as XML or JSON depending on the
    //                outer isXml flag.
    //   search_id  - id of the text input the query is read from.
    //   content_id - id of the element the results are rendered into.
    //
    // A spinner overlay covers the page while the request is running. On success
    // the input handler is attached and the popup is opened via proceedsearch().
    // On failure the overlay is removed again so the page is not left blocked.
    var searchFunc = function(path, search_id, content_id) {
      'use strict';

      // start loading animation
      $("body")
        .append('<div class="search-popup-overlay local-search-pop-overlay">' +
          '<div id="search-loading-icon">' +
          '<i class="fa fa-spinner fa-pulse fa-5x fa-fw"></i>' +
          '</div>' +
          '</div>')
        .css('overflow', 'hidden');
      $("#search-loading-icon").css('margin', '20% auto 0 auto').css('text-align', 'center');

      $.ajax({
        url: path,
        dataType: isXml ? "xml" : "json",
        async: true,
        success: function(res) {
          // get the contents from search data
          isfetched = true;
          $('.popup').detach().appendTo('.header-inner');
          // XML: build {title, content, url} records from each <entry>.
          // JSON: the response is used as-is.
          var datas = isXml ? $("entry", res).map(function() {
            return {
              title: $("title", this).text(),
              content: $("content",this).text(),
              url: $("url" , this).text()
            };
          }).get() : res;
          var input = document.getElementById(search_id);
          var resultContent = document.getElementById(content_id);

          // Run the query currently typed into the input and render the result list.
          var inputEventFunction = function() {
            var searchText = input.value.trim().toLowerCase();
            // Split on whitespace and hyphens; for multi-word queries the whole
            // phrase is searched for as one extra keyword.
            var keywords = searchText.split(/[\s\-]+/);
            if (keywords.length > 1) {
              keywords.push(searchText);
            }
            var resultItems = [];
            if (searchText.length > 0) {
              // perform local searching
              datas.forEach(function(data) {
                var isMatch = false;
                var hitCount = 0;
                // Number of whole-query hits shown for this article; updated by mergeIntoSlice.
                var searchTextCount = 0;
                var title = data.title.trim();
                var titleInLowerCase = title.toLowerCase();
                // Strip markup tags from the content before searching it.
                var content = data.content.trim().replace(/<[^>]+>/g,"");
                var contentInLowerCase = content.toLowerCase();
                var articleUrl = decodeURIComponent(data.url);
                var indexOfTitle = [];
                var indexOfContent = [];
                // only match articles with not empty titles
                if(title != '') {
                  keywords.forEach(function(keyword) {
                    // Return {position, word} for every non-overlapping occurrence
                    // of word in text; an empty word yields no hits.
                    function getIndexByWord(word, text, caseSensitive) {
                      var wordLen = word.length;
                      if (wordLen === 0) {
                        return [];
                      }
                      var startPosition = 0, position = [], index = [];
                      if (!caseSensitive) {
                        text = text.toLowerCase();
                        word = word.toLowerCase();
                      }
                      while ((position = text.indexOf(word, startPosition)) > -1) {
                        index.push({position: position, word: word});
                        startPosition = position + wordLen;
                      }
                      return index;
                    }

                    indexOfTitle = indexOfTitle.concat(getIndexByWord(keyword, titleInLowerCase, false));
                    indexOfContent = indexOfContent.concat(getIndexByWord(keyword, contentInLowerCase, false));
                  });
                  if (indexOfTitle.length > 0 || indexOfContent.length > 0) {
                    isMatch = true;
                    hitCount = indexOfTitle.length + indexOfContent.length;
                  }
                }

                // show search results

                if (isMatch) {
                  // sort index by position of keyword
                  // Later positions come first, so pop() walks the hits from the
                  // start of the text; at equal positions the longer word ends up
                  // last and is therefore popped first.

                  [indexOfTitle, indexOfContent].forEach(function (index) {
                    index.sort(function (itemLeft, itemRight) {
                      if (itemRight.position !== itemLeft.position) {
                        return itemRight.position - itemLeft.position;
                      } else {
                        return itemLeft.word.length - itemRight.word.length;
                      }
                    });
                  });

                  // merge hits into slices
                  // Pops hits off the end of index while they end at or before
                  // `end`, discarding any hit that starts before the previously
                  // accepted hit ends. Adds the slice's whole-query hit count to
                  // the article-level searchTextCount. `text` is not used.

                  function mergeIntoSlice(text, start, end, index) {
                    var item = index[index.length - 1];
                    var position = item.position;
                    var word = item.word;
                    var hits = [];
                    var searchTextCountInSlice = 0;
                    while (position + word.length <= end && index.length != 0) {
                      if (word === searchText) {
                        searchTextCountInSlice++;
                      }
                      hits.push({position: position, length: word.length});
                      var wordEnd = position + word.length;

                      // move to next position of hit

                      index.pop();
                      while (index.length != 0) {
                        item = index[index.length - 1];
                        position = item.position;
                        word = item.word;
                        if (wordEnd > position) {
                          index.pop();
                        } else {
                          break;
                        }
                      }
                    }
                    searchTextCount += searchTextCountInSlice;
                    return {
                      hits: hits,
                      start: start,
                      end: end,
                      searchTextCount: searchTextCountInSlice
                    };
                  }

                  // The title is treated as one slice spanning its full length.
                  var slicesOfTitle = [];
                  if (indexOfTitle.length != 0) {
                    slicesOfTitle.push(mergeIntoSlice(title, 0, title.length, indexOfTitle));
                  }

                  var slicesOfContent = [];
                  while (indexOfContent.length != 0) {
                    var item = indexOfContent[indexOfContent.length - 1];
                    var position = item.position;
                    var word = item.word;
                    // cut out 100 characters
                    // (20 before the hit and 80 from its start), clamped to the
                    // content bounds and widened if the word itself is longer.
                    var start = position - 20;
                    var end = position + 80;
                    if(start < 0){
                      start = 0;
                    }
                    if (end < position + word.length) {
                      end = position + word.length;
                    }
                    if(end > content.length){
                      end = content.length;
                    }
                    slicesOfContent.push(mergeIntoSlice(content, start, end, indexOfContent));
                  }

                  // sort slices in content by search text's count and hits' count
                  // (both descending); remaining ties go to the earlier slice.

                  slicesOfContent.sort(function (sliceLeft, sliceRight) {
                    if (sliceLeft.searchTextCount !== sliceRight.searchTextCount) {
                      return sliceRight.searchTextCount - sliceLeft.searchTextCount;
                    } else if (sliceLeft.hits.length !== sliceRight.hits.length) {
                      return sliceRight.hits.length - sliceLeft.hits.length;
                    } else {
                      return sliceLeft.start - sliceRight.start;
                    }
                  });

                  // select top N slices in content

                  var upperBound = parseInt('1', 10);
                  if (upperBound >= 0) {
                    slicesOfContent = slicesOfContent.slice(0, upperBound);
                  }

                  // highlight title and content
                  // Returns text[slice.start, slice.end) with every hit wrapped in
                  // <b class="search-keyword">.

                  function highlightKeyword(text, slice) {
                    var result = '';
                    var prevEnd = slice.start;
                    slice.hits.forEach(function (hit) {
                      result += text.substring(prevEnd, hit.position);
                      var end = hit.position + hit.length;
                      result += '<b class="search-keyword">' + text.substring(hit.position, end) + '</b>';
                      prevEnd = end;
                    });
                    result += text.substring(prevEnd, slice.end);
                    return result;
                  }

                  var resultItem = '';

                  if (slicesOfTitle.length != 0) {
                    resultItem += "<li><a href='" + articleUrl + "' class='search-result-title'>" + highlightKeyword(title, slicesOfTitle[0]) + "</a>";
                  } else {
                    resultItem += "<li><a href='" + articleUrl + "' class='search-result-title'>" + title + "</a>";
                  }

                  slicesOfContent.forEach(function (slice) {
                    resultItem += "<a href='" + articleUrl + "'>" +
                      "<p class=\"search-result\">" + highlightKeyword(content, slice) +
                      "...</p>" + "</a>";
                  });

                  resultItem += "</li>";
                  resultItems.push({
                    item: resultItem,
                    searchTextCount: searchTextCount,
                    hitCount: hitCount,
                    id: resultItems.length
                  });
                }
              });
            }
            if (keywords.length === 1 && keywords[0] === "") {
              // Empty query: show the search icon placeholder.
              resultContent.innerHTML = '<div id="no-result"><i class="fa fa-search fa-5x" /></div>'
            } else if (resultItems.length === 0) {
              // Nothing matched: show the "frown" placeholder.
              resultContent.innerHTML = '<div id="no-result"><i class="fa fa-frown-o fa-5x" /></div>'
            } else {
              // Order articles by whole-query hits, then total hits (both
              // descending); remaining ties put the later-indexed article first.
              resultItems.sort(function (resultLeft, resultRight) {
                if (resultLeft.searchTextCount !== resultRight.searchTextCount) {
                  return resultRight.searchTextCount - resultLeft.searchTextCount;
                } else if (resultLeft.hitCount !== resultRight.hitCount) {
                  return resultRight.hitCount - resultLeft.hitCount;
                } else {
                  return resultRight.id - resultLeft.id;
                }
              });
              var searchResultList = '<ul class=\"search-result-list\">';
              resultItems.forEach(function (result) {
                searchResultList += result.item;
              })
              searchResultList += "</ul>";
              resultContent.innerHTML = searchResultList;
            }
          }

          // Trigger mode: 'auto' searches on every input event; any other value
          // searches on the search icon click or the Enter key (key code 13).
          if ('auto' === 'auto') {
            input.addEventListener('input', inputEventFunction);
          } else {
            $('.search-icon').click(inputEventFunction);
            input.addEventListener('keypress', function (event) {
              if (event.keyCode === 13) {
                inputEventFunction();
              }
            });
          }

          // remove loading animation
          $(".local-search-pop-overlay").remove();
          $('body').css('overflow', '');

          proceedsearch();
        },
        error: function() {
          // The index could not be loaded or parsed. Take the spinner overlay
          // down and re-enable scrolling; nothing else on the page would remove
          // it. isfetched is still false, so the next click on the trigger
          // retries the download.
          $(".local-search-pop-overlay").remove();
          $('body').css('overflow', '');
        }
      });
    };

    // handle and trigger popup window;
    // Open the popup from any trigger element. While the index has not been
    // loaded yet the click starts the download (searchFunc opens the popup when
    // it succeeds); afterwards the popup is opened directly.
    $('.popup-trigger').click(function (clickEvent) {
      clickEvent.stopPropagation();
      if (isfetched !== false) {
        proceedsearch();
        return;
      }
      searchFunc(path, 'local-search-input', 'local-search-result');
    });

    // The close button dismisses the popup.
    $('.popup-btn-close').click(onPopupClose);

    // Clicks inside the popup are not propagated to ancestor elements.
    $('.popup').click(function (clickEvent) {
      clickEvent.stopPropagation();
    });

    // Escape (key code 27) dismisses the popup, but only while it is visible.
    $(document).on('keyup', function (keyEvent) {
      if (keyEvent.which !== 27) {
        return;
      }
      if ($('.search-popup').is(':visible')) {
        onPopupClose();
      }
    });
  </script>





  

  

  

  
  

  
  
    <script type="text/x-mathjax-config">
      // MathJax (v2 API) configuration for the tex2jax preprocessor.
      MathJax.Hub.Config({
        tex2jax: {
          // Inline math is delimited by $...$ or \(...\).
          inlineMath: [ ['$','$'], ["\\(","\\)"]  ],
          // With processEscapes on, \$ is treated as a literal dollar sign
          // rather than a math delimiter (per the MathJax tex2jax docs).
          processEscapes: true,
          // Elements whose contents are not scanned for math.
          skipTags: ['script', 'noscript', 'style', 'textarea', 'pre', 'code']
        }
      });
    </script>

    <script type="text/x-mathjax-config">
      // Queued on the MathJax hub: add the "has-jax" class to the parent
      // element of every math element MathJax reports via getAllJax().
      MathJax.Hub.Queue(function() {
        MathJax.Hub.getAllJax().forEach(function(jax) {
          jax.SourceElement().parentNode.className += ' has-jax';
        });
      });
    </script>
    <!-- MathJax 2.7.1 loaded over HTTPS from cdnjs with the same combined
         configuration as before. -->
    <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>
  


  

  

</body>
</html>
